Data Structure and Characteristics
# Read the Titanic training CSV into `titanic_raw` (column-type message muted)
# NOTE(review): the path is absolute and specific to one machine; the script
# only runs where this exact file exists.
titanic_raw <- read_csv(
  file = '/Users/roberto/Desktop/titanic_project/train (2).csv',
  show_col_types = FALSE
)
# Report how many rows (observations) and columns (variables) were read
cat(
  paste(
    "Dataset Dimensions:", nrow(titanic_raw), "observations,",
    ncol(titanic_raw), "variables\n"
  )
)
Dataset Dimensions: 891 observations, 12 variables
# Print a column-wise preview of the raw data: one line per variable showing
# its name, storage type, and leading values
glimpse(titanic_raw)
Rows: 891
Columns: 12
$ PassengerId <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,…
$ Survived <dbl> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1…
$ Pclass <dbl> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2, 3, 1, 3, 3, 3, 2, 3, 2, 3, 3…
$ Name <chr> "Braund, Mr. Owen Harris", "Cumings, Mrs. John Bradley (Fl…
$ Sex <chr> "male", "female", "female", "female", "male", "male", "mal…
$ Age <dbl> 22, 38, 26, 35, 35, NA, 54, 2, 27, 14, 4, 58, 20, 39, 14, …
$ SibSp <dbl> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4, 0, 1, 0…
$ Parch <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1, 0, 0, 0…
$ Ticket <chr> "A/5 21171", "PC 17599", "STON/O2. 3101282", "113803", "37…
$ Fare <dbl> 7.2500, 71.2833, 7.9250, 53.1000, 8.0500, 8.4583, 51.8625,…
$ Cabin <chr> NA, "C85", NA, "C123", NA, NA, "E46", NA, NA, NA, "G6", "C…
$ Embarked <chr> "S", "C", "S", "S", "S", "Q", "S", "S", "S", "C", "S", "S"…
The dataset contains 891 passenger records with 12 variables,
representing a subset of the complete passenger manifest. Key variables
include survival status (binary), passenger class (ordinal), demographic
characteristics (age, sex), family composition (siblings/spouses,
parents/children), fare paid, and embarkation port.
Data Preprocessing and Feature Engineering
# Preprocessing and feature engineering, done in three stages on a copy of the
# raw table (`titanic_raw` itself is not modified):
#   1. recode categorical variables as factors,
#   2. impute missing ages while keeping an audit trail,
#   3. derive family-size features.
titanic <- titanic_raw %>%
  # Stage 1: categorical recoding
  mutate(
    # Survival outcome: 0 -> "No", 1 -> "Yes"
    Survived = factor(Survived, levels = c(0, 1), labels = c("No", "Yes")),
    # Passenger class as an ordered factor with level order 3 < 2 < 1.
    # NOTE(review): ordered factors get polynomial contrasts by default in
    # model formulas, so 3rd class is the lowest level rather than a
    # treatment "reference" level; confirm this is what the models need.
    Pclass = ordered(Pclass, levels = c("3", "2", "1")),
    Sex = factor(Sex),
    # Missing embarkation ports are folded into "S" (Southampton), which the
    # author treats as the modal port (stated as >70% of passengers;
    # TODO confirm against table(Embarked)).
    Embarked = fct_na_value_to_level(factor(Embarked), level = "S")
  ) %>%
  # Stage 2: age imputation
  mutate(
    # Keep the untouched ages and a flag marking which rows get imputed
    Age_original = Age,
    Age_imputed = is.na(Age),
    # Fill missing ages with the median of the observed ages (chosen for
    # robustness to outliers; multiple imputation is a heavier alternative)
    Age = coalesce(Age, median(Age, na.rm = TRUE))
  ) %>%
  # Stage 3: family features
  mutate(
    # The passenger plus siblings/spouses plus parents/children aboard.
    # Working hypothesis: medium-sized families survive best (cooperation
    # vs. coordination difficulty).
    FamilySize = 1 + SibSp + Parch,
    # Coarse, mutually exclusive size bands for interpretation
    FamilyCategory = case_when(
      FamilySize >= 5 ~ "Large",
      FamilySize == 1 ~ "Alone",
      FamilySize %in% 2:4 ~ "Small"
    )
  )
# Document impact of imputation
# Builds a five-row table comparing summary statistics of Age before
# imputation (`titanic_raw`, missing values dropped) and after imputation
# (`titanic`), then renders it as a styled kable table.
age_comparison <- data.frame(
  Statistic = c("Mean", "Median", "Std Dev", "Min", "Max"),
  # Raw ages contain NA, so every summary drops them explicitly.
  # `TRUE` is spelled out because `T` is an ordinary variable that can be
  # reassigned, which would silently change the result.
  Original = c(
    mean(titanic_raw$Age, na.rm = TRUE),
    median(titanic_raw$Age, na.rm = TRUE),
    sd(titanic_raw$Age, na.rm = TRUE),
    min(titanic_raw$Age, na.rm = TRUE),
    max(titanic_raw$Age, na.rm = TRUE)
  ),
  # No na.rm here: the imputed column is expected to be NA-free, so an NA in
  # this column of the table would indicate that imputation did not complete.
  After_Imputation = c(
    mean(titanic$Age),
    median(titanic$Age),
    sd(titanic$Age),
    min(titanic$Age),
    max(titanic$Age)
  )
)
# Render with two decimals and a caption
age_comparison %>%
  kable(
    digits = 2,
    caption = "Impact of Age Imputation on Distribution",
    booktabs = TRUE
  ) %>%
  kable_styling(latex_options = c("striped", "hold_position"))
Impact of Age Imputation on Distribution

| Statistic | Original | After_Imputation |
|-----------|---------:|-----------------:|
| Mean      |    29.70 |            29.36 |
| Median    |    28.00 |            28.00 |
| Std Dev   |    14.53 |            13.02 |
| Min       |     0.42 |             0.42 |
| Max       |    80.00 |            80.00 |